import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Read the advertising dataset from the working directory and show
# the first five rows as a quick sanity check of the columns.
data = pd.read_csv(filepath_or_buffer="salesPredictionDataset.csv")
print(data.head(n=5))
TV Radio Newspaper Sales 0 230.1 37.8 69.2 22.1 1 44.5 39.3 45.1 10.4 2 17.2 45.9 69.3 12.0 3 151.5 41.3 58.5 16.5 4 180.8 10.8 58.4 17.9
# Count missing values per column (isna is the same check as isnull).
print(data.isna().sum())
TV 0 Radio 0 Newspaper 0 Sales 0 dtype: int64
import plotly.express as px
import plotly.graph_objects as go
# Draw one bubble scatter plot per advertising channel against Sales,
# in the order TV, Newspaper, Radio. Marker size follows the channel's
# spend and trendline="ols" overlays a least-squares fit line.
for channel in ("TV", "Newspaper", "Radio"):
    figure = px.scatter(
        data_frame=data,
        x="Sales",
        y=channel,
        size=channel,
        trendline="ols",
    )
    figure.show()
# Pairwise Pearson correlation between all columns; then list how
# strongly each column correlates with Sales, strongest first.
correlation = data.corr(method="pearson")
print(correlation.loc[:, "Sales"].sort_values(ascending=False))
Sales 1.000000 TV 0.901208 Radio 0.349631 Newspaper 0.157960 Name: Sales, dtype: float64
# Split the frame into a feature matrix (every column except Sales)
# and a target vector (Sales).
# `columns=` is used instead of the positional axis argument
# (`drop(["Sales"], 1)`): pandas deprecated positional arguments other
# than `labels` and made them keyword-only in pandas 2.0, where the old
# call raises a TypeError.
x = np.array(data.drop(columns=["Sales"]))
y = np.array(data["Sales"])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)
C:\Users\Prakhar\AppData\Local\Temp/ipykernel_1576/1754818733.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
# Fit an ordinary least-squares linear model on the training split
# (fit returns the estimator itself), then report its score on the
# held-out test split.
model = LinearRegression().fit(xtrain, ytrain)
print(model.score(xtest, ytest))
0.9059011844150826
# Predict Sales for a single observation.
# The values must follow the training column order: [TV, Radio, Newspaper].
# NOTE(review): the TV value here is several orders of magnitude larger
# than the TV values shown in the data preview, so this prediction is a
# far extrapolation - confirm the value is intended.
features = np.array([5600006.1, 37.8, 69.2]).reshape(1, -1)
print(model.predict(features))
[305261.07915204]